library(tidyverse)
library(GGally)
library(cowplot)
library(class)
library(caret)
library(e1071)
library(reshape2)
library(stringr)
# Load the attrition data set (870 employees x 36 variables).
# header = TRUE spelled out: T is a reassignable binding, not a keyword.
attrition.df <- read.csv("CaseStudy2-data.csv", header = TRUE)
str(attrition.df)
## 'data.frame': 870 obs. of 36 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 32 40 35 32 24 27 41 37 34 34 ...
## $ Attrition : chr "No" "No" "No" "No" ...
## $ BusinessTravel : chr "Travel_Rarely" "Travel_Rarely" "Travel_Frequently" "Travel_Rarely" ...
## $ DailyRate : int 117 1308 200 801 567 294 1283 309 1333 653 ...
## $ Department : chr "Sales" "Research & Development" "Research & Development" "Sales" ...
## $ DistanceFromHome : int 13 14 18 1 2 10 5 10 10 10 ...
## $ Education : int 4 3 2 4 1 2 5 4 4 4 ...
## $ EducationField : chr "Life Sciences" "Medical" "Life Sciences" "Marketing" ...
## $ EmployeeCount : int 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : int 859 1128 1412 2016 1646 733 1448 1105 1055 1597 ...
## $ EnvironmentSatisfaction : int 2 3 3 3 1 4 2 4 3 4 ...
## $ Gender : chr "Male" "Male" "Male" "Female" ...
## $ HourlyRate : int 73 44 60 48 32 32 90 88 87 92 ...
## $ JobInvolvement : int 3 2 3 3 3 3 4 2 3 2 ...
## $ JobLevel : int 2 5 3 3 1 3 1 2 1 2 ...
## $ JobRole : chr "Sales Executive" "Research Director" "Manufacturing Director" "Sales Executive" ...
## $ JobSatisfaction : int 4 3 4 4 4 1 3 4 3 3 ...
## $ MaritalStatus : chr "Divorced" "Single" "Single" "Married" ...
## $ MonthlyIncome : int 4403 19626 9362 10422 3760 8793 2127 6694 2220 5063 ...
## $ MonthlyRate : int 9250 17544 19944 24032 17218 4809 5561 24223 18410 15332 ...
## $ NumCompaniesWorked : int 2 1 2 1 1 1 2 2 1 1 ...
## $ Over18 : chr "Y" "Y" "Y" "Y" ...
## $ OverTime : chr "No" "No" "No" "No" ...
## $ PercentSalaryHike : int 11 14 11 19 13 21 12 14 19 14 ...
## $ PerformanceRating : int 3 3 3 3 3 4 3 3 3 3 ...
## $ RelationshipSatisfaction: int 3 1 3 3 3 3 1 3 4 2 ...
## $ StandardHours : int 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : int 1 0 0 2 0 2 0 3 1 1 ...
## $ TotalWorkingYears : int 8 21 10 14 6 9 7 8 1 8 ...
## $ TrainingTimesLastYear : int 3 2 2 3 2 4 5 5 2 3 ...
## $ WorkLifeBalance : int 2 4 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : int 5 20 2 14 6 9 4 1 1 8 ...
## $ YearsInCurrentRole : int 2 7 2 10 3 7 2 0 1 2 ...
## $ YearsSinceLastPromotion : int 0 4 2 5 1 1 0 0 0 7 ...
## $ YearsWithCurrManager : int 3 9 2 7 3 7 3 0 0 7 ...
# Competition set with MonthlyIncome withheld (300 rows x 35 variables).
# header = TRUE spelled out: T is a reassignable binding, not a keyword.
noSalary <- read.csv("CaseStudy2CompSet(NoSalary).csv", header = TRUE)
str(noSalary)
## 'data.frame': 300 obs. of 35 variables:
## $ ID : int 871 872 873 874 875 876 877 878 879 880 ...
## $ Age : int 43 33 55 36 27 39 33 21 30 51 ...
## $ Attrition : chr "No" "No" "Yes" "No" ...
## $ BusinessTravel : chr "Travel_Frequently" "Travel_Rarely" "Travel_Rarely" "Non-Travel" ...
## $ DailyRate : int 1422 461 267 1351 1302 895 750 251 1312 1405 ...
## $ Department : chr "Sales" "Research & Development" "Sales" "Research & Development" ...
## $ DistanceFromHome : int 2 13 13 9 19 5 22 10 23 11 ...
## $ Education : int 4 1 4 4 3 3 2 2 3 2 ...
## $ EducationField : chr "Life Sciences" "Life Sciences" "Marketing" "Life Sciences" ...
## $ EmployeeCount : int 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : int 1849 995 1372 1949 1619 42 160 1279 159 1367 ...
## $ EnvironmentSatisfaction : int 1 2 1 1 4 4 3 1 1 4 ...
## $ Gender : chr "Male" "Female" "Male" "Male" ...
## $ HourlyRate : int 92 53 85 66 67 56 95 45 96 82 ...
## $ JobInvolvement : int 3 3 4 4 2 3 3 2 1 2 ...
## $ JobLevel : int 2 1 4 1 1 2 2 1 1 4 ...
## $ JobRole : chr "Sales Executive" "Research Scientist" "Sales Executive" "Laboratory Technician" ...
## $ JobSatisfaction : int 4 4 3 2 1 4 2 3 3 2 ...
## $ MaritalStatus : chr "Married" "Single" "Single" "Married" ...
## $ MonthlyRate : int 19246 17241 9277 9238 16290 3335 15480 25308 22310 24439 ...
## $ NumCompaniesWorked : int 1 3 6 1 1 3 0 1 1 3 ...
## $ Over18 : chr "Y" "Y" "Y" "Y" ...
## $ OverTime : chr "No" "No" "Yes" "No" ...
## $ PercentSalaryHike : int 20 18 17 22 11 14 13 20 25 16 ...
## $ PerformanceRating : int 4 3 3 4 3 3 3 4 4 3 ...
## $ RelationshipSatisfaction: int 3 1 3 2 1 3 1 3 3 2 ...
## $ StandardHours : int 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : int 1 0 0 0 2 1 1 0 3 0 ...
## $ TotalWorkingYears : int 7 5 24 5 7 19 8 2 10 29 ...
## $ TrainingTimesLastYear : int 5 4 2 3 3 6 2 2 2 1 ...
## $ WorkLifeBalance : int 3 3 2 3 3 4 4 1 2 2 ...
## $ YearsAtCompany : int 7 3 19 5 7 1 7 2 10 5 ...
## $ YearsInCurrentRole : int 7 2 7 4 7 0 7 2 7 2 ...
## $ YearsSinceLastPromotion : int 7 0 3 0 0 0 0 2 0 0 ...
## $ YearsWithCurrManager : int 7 2 8 2 7 0 7 2 9 3 ...
# Does marital status contribute to attrition?
# Faceted bar chart of marital-status counts for each attrition outcome.
bar <- ggplot(attrition.df, aes(x = MaritalStatus, fill = MaritalStatus)) +
  geom_bar() +
  facet_wrap(~Attrition) +
  ggtitle("Attrition Dependent on Marital Status") +
  xlab("Marital Status") +  # was mislabeled "Attrition"; the x axis is marital status
  ylab("Count")
# Yes Attrition: exact counts per marital status
# (sum of exact equality replaces grep(): grep matches substrings, which only
# works here by accident of the level names)
df1 <- attrition.df %>%
  filter(Attrition == "Yes")
df1length <- NROW(df1)
df1married <- sum(df1$MaritalStatus == "Married")
df1divorced <- sum(df1$MaritalStatus == "Divorced")
df1single <- sum(df1$MaritalStatus == "Single")
Yes_df <- data.frame(MaritalStatus = c("Married", "Divorced", "Single"),
                     Value = c(df1married, df1divorced, df1single))
plot1 <- ggplot(Yes_df, aes(x = "", y = Value, fill = MaritalStatus)) +
  geom_bar(width = 1, stat = "identity") + xlab("") + ylab("") + ggtitle("Attrition: Yes")
pie1 <- plot1 + coord_polar("y", start = 0)
# No Attrition: exact counts per marital status
df2 <- attrition.df %>%
  filter(Attrition == "No")
df2length <- NROW(df2)
df2married <- sum(df2$MaritalStatus == "Married")
df2divorced <- sum(df2$MaritalStatus == "Divorced")
df2single <- sum(df2$MaritalStatus == "Single")
No_df <- data.frame(MaritalStatus = c("Married", "Divorced", "Single"),
                    Value = c(df2married, df2divorced, df2single))
plot2 <- ggplot(No_df, aes(x = "", y = Value, fill = MaritalStatus)) +
  geom_bar(width = 1, stat = "identity") + xlab("") + ylab("") + ggtitle("Attrition: No")
pie2 <- plot2 + coord_polar("y", start = 0)
# Combine: bar chart on top (A), the two pies below (B, C)
piecharts <- plot_grid(pie2, pie1, ncol = 2, labels = c("B", "C"))
allplots <- plot_grid(bar, piecharts, nrow = 2, labels = "A")
allplots
# Distribution of monthly income, split by attrition outcome
attrition.df %>%
  ggplot(aes(x = MonthlyIncome, fill = Attrition)) +
  geom_histogram() +
  ggtitle("Monthly Income Based on Attrition") +
  xlab("Monthly Income") +
  ylab("Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Same income distribution, one panel per job role
attrition.df %>%
  ggplot(aes(x = MonthlyIncome, fill = Attrition)) +
  geom_histogram() +
  facet_wrap(~JobRole) +
  ggtitle("Monthly Income Based on Job Role") +
  xlab("Monthly Income") +
  ylab("Count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Proportion-filled bar charts: share of Yes/No attrition within each level
# of one column. The helper removes the nine-times duplicated ggplot chain.
# xlab(col) pins the axis label to the column name, matching the original
# plots' default labels.
attrition_fill_bar <- function(col, title) {
  ggplot(attrition.df, aes(x = .data[[col]], fill = Attrition)) +
    geom_bar(position = "fill") +
    ggtitle(title) +
    xlab(col) +
    ylab("% Count") +
    scale_y_continuous(labels = scales::percent) +
    theme(legend.position = "none")
}
EDA1 <- attrition_fill_bar("RelationshipSatisfaction", "Relationship Satisfaction")
EDA2 <- attrition_fill_bar("JobLevel", "Job Level")
EDA3 <- attrition_fill_bar("JobSatisfaction", "Job Satisfaction")
EDA4 <- attrition_fill_bar("YearsWithCurrManager", "Years With Current Manager")
EDA5 <- attrition_fill_bar("YearsSinceLastPromotion", "Years Since Last Promotion")
EDA6 <- attrition_fill_bar("YearsAtCompany", "Years At Company")
EDA7 <- attrition_fill_bar("YearsInCurrentRole", "Years in Current Role")
EDA8 <- attrition_fill_bar("NumCompaniesWorked", "Companies Worked")
EDA9 <- attrition_fill_bar("MaritalStatus", "Marital Status")
# 3x3 grid of all nine charts
EDA10 <- plot_grid(EDA1, EDA2, EDA3, EDA4, EDA5, EDA6, EDA7, EDA8, EDA9, ncol = 3, nrow = 3)
EDA10
# Scatter plots of candidate numeric pairs, coloured by Attrition.
# The helper removes the six-times duplicated ggplot chain; xlab/ylab pin
# the axis labels to the column names, matching the originals' defaults.
attrition_scatter <- function(xcol, ycol, title) {
  ggplot(attrition.df, aes(x = .data[[xcol]], y = .data[[ycol]], color = Attrition)) +
    geom_point() +
    ggtitle(title) +
    xlab(xcol) +
    ylab(ycol)
}
EDA.A <- attrition_scatter("TotalWorkingYears", "PercentSalaryHike",
                           "Total Working Years vs Percent Salary Hike")
EDA.B <- attrition_scatter("YearsWithCurrManager", "PercentSalaryHike",
                           "Years With Current Manager vs Percent Salary Hike")
EDA.C <- attrition_scatter("YearsAtCompany", "YearsInCurrentRole",
                           "Years At Company vs Years In CurrentRole")
EDA.D <- attrition_scatter("Age", "MonthlyIncome", "Age vs Monthly Income")
EDA.E <- attrition_scatter("Age", "YearsSinceLastPromotion",
                           "Age vs Job YearsSinceLastPromotion")
EDA.F <- attrition_scatter("Age", "PercentSalaryHike", "Age vs Percent Salary Hike")
# 2x3 grid of the six scatter plots
EDA.G <- plot_grid(EDA.A, EDA.B, EDA.C, EDA.D, EDA.E, EDA.F, ncol = 2, nrow = 3)
EDA.G
# Pairwise view of columns 2, 25, 35, 36 (Age, PercentSalaryHike,
# YearsSinceLastPromotion, YearsWithCurrManager) plus Attrition (col 3),
# coloured by Attrition
ggpairs(attrition.df, columns = c(2, 25, 35, 36, 3), aes(color = Attrition))
# Age vs income, one panel per job role
attrition.df %>%
  ggplot(aes(x = Age, y = MonthlyIncome, color = Attrition)) +
  geom_point() +
  facet_wrap(~JobRole)
# Conclusion: this model does not have enough "Yes" attrition data to be usable.
# An internal (cross-validated) model may handle this skewed data better.
# 85/15 train/test split for the attrition classifiers (seeded for reproducibility)
set.seed(9)
splitPerc <- .85
trainIndices <- sample(1:dim(attrition.df)[1],round(splitPerc * dim(attrition.df)[1]))
dfTrain <- attrition.df[trainIndices,]
dfTest <- attrition.df[-trainIndices,]
dfTrain <- na.omit(dfTrain)
dfTest <- na.omit(dfTest)
# Candidate numeric predictors (column index: name):
# 2:Age, 18:JobSatisfaction, 20:MonthlyIncome, 25:PercentSalaryHike,
# 30:TotalWorkingYears, 33:YearsAtCompany,
# 34:YearsInCurrentRole, 35:YearsSinceLastPromotion, 36:YearsWithCurrManager
# knn model on Age (col 2) and YearsSinceLastPromotion (col 35), k = 10
# NOTE(review): predictors are fed to knn() unscaled, so the Euclidean
# distance is dominated by whichever variable has the larger range --
# consider scale()-ing both columns and confirming the effect on accuracy.
classifications <- knn(dfTrain[,c(2,35)], dfTest[,c(2,35)], dfTrain$Attrition,
prob = TRUE, k = 10)
table(dfTest$Attrition,classifications)
## classifications
## No Yes
## No 106 0
## Yes 22 2
confusionMatrix(table(dfTest$Attrition,classifications))
## Confusion Matrix and Statistics
##
## classifications
## No Yes
## No 106 0
## Yes 22 2
##
## Accuracy : 0.8308
## 95% CI : (0.7551, 0.8908)
## No Information Rate : 0.9846
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1291
##
## Mcnemar's Test P-Value : 7.562e-06
##
## Sensitivity : 0.82812
## Specificity : 1.00000
## Pos Pred Value : 1.00000
## Neg Pred Value : 0.08333
## Prevalence : 0.98462
## Detection Rate : 0.81538
## Detection Prevalence : 0.81538
## Balanced Accuracy : 0.91406
##
## 'Positive' Class : No
##
# Grid search: for each of 100 random 85/15 splits, fit knn on Age (col 2)
# and YearsSinceLastPromotion (col 35) for k = 1..50 and record the accuracy.
# Removed: an unused `accs` data.frame rebuilt every iteration and a bare
# table() call whose result was discarded.
iterations <- 100
numks <- 50
masterAcc <- matrix(nrow = iterations, ncol = numks)
for (j in seq_len(iterations))
{
  # Fresh random partition each iteration (splitPerc set above)
  trainIndices <- sample(1:dim(attrition.df)[1], round(splitPerc * dim(attrition.df)[1]))
  train <- attrition.df[trainIndices, ]
  test <- attrition.df[-trainIndices, ]
  train <- na.omit(train)
  test <- na.omit(test)
  for (i in seq_len(numks))
  {
    classifications <- knn(train[, c(2, 35)], test[, c(2, 35)], train$Attrition, prob = TRUE, k = i)
    CM <- confusionMatrix(table(classifications, test$Attrition))
    masterAcc[j, i] <- CM$overall[1]
  }
}
# Average accuracy per k across all partitions, plotted as a curve over k
MeanAcc <- colMeans(masterAcc)
plot(seq(1, numks, 1), MeanAcc, type = "l", xlab = "k", ylab = "Mean Accuracy", main = "Mean Accuracy of k Values")
# Internal Model: leave-one-out cross-validated knn on the training set only,
# same two predictors (Age, YearsSinceLastPromotion), k = 20
classifications1 <- knn.cv(dfTrain[,c(2,35)],dfTrain$Attrition, k = 20)
confusionMatrix(table(classifications1,dfTrain$Attrition))
## Confusion Matrix and Statistics
##
##
## classifications1 No Yes
## No 618 107
## Yes 6 9
##
## Accuracy : 0.8473
## 95% CI : (0.8193, 0.8725)
## No Information Rate : 0.8432
## P-Value [Acc > NIR] : 0.4044
##
## Kappa : 0.1053
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.99038
## Specificity : 0.07759
## Pos Pred Value : 0.85241
## Neg Pred Value : 0.60000
## Prevalence : 0.84324
## Detection Rate : 0.83514
## Detection Prevalence : 0.97973
## Balanced Accuracy : 0.53399
##
## 'Positive' Class : No
##
# Naive Bayes on OverTime (col 24) and MonthlyIncome (col 20) only,
# with Laplace smoothing
NBmodel <- naiveBayes(dfTrain[,c(24,20)],dfTrain$Attrition,laplace = 1)
table(predict(NBmodel,dfTest[,c(24,20)]),dfTest$Attrition)
##
## No Yes
## No 106 24
## Yes 0 0
# Per the table above, this model predicts "No" for every test row
confusionMatrix(table(predict(NBmodel,dfTest[,c(24,20)]),dfTest$Attrition))
## Confusion Matrix and Statistics
##
##
## No Yes
## No 106 24
## Yes 0 0
##
## Accuracy : 0.8154
## 95% CI : (0.7379, 0.878)
## No Information Rate : 0.8154
## P-Value [Acc > NIR] : 0.5543
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 2.668e-06
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.8154
## Neg Pred Value : NaN
## Prevalence : 0.8154
## Detection Rate : 0.8154
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : No
##
# Naive Bayes using every other column as a predictor (note this includes ID
# and the constant columns EmployeeCount/StandardHours/Over18)
NBmodel1 = naiveBayes(Attrition~.,data = dfTrain)
table(predict(NBmodel1,dfTest))
##
## No Yes
## 102 28
confusionMatrix(table(predict(NBmodel1,dfTest),dfTest$Attrition))
## Confusion Matrix and Statistics
##
##
## No Yes
## No 96 6
## Yes 10 18
##
## Accuracy : 0.8769
## 95% CI : (0.8078, 0.928)
## No Information Rate : 0.8154
## P-Value [Acc > NIR] : 0.0401
##
## Kappa : 0.616
##
## Mcnemar's Test P-Value : 0.4533
##
## Sensitivity : 0.9057
## Specificity : 0.7500
## Pos Pred Value : 0.9412
## Neg Pred Value : 0.6429
## Prevalence : 0.8154
## Detection Rate : 0.7385
## Detection Prevalence : 0.7846
## Balanced Accuracy : 0.8278
##
## 'Positive' Class : No
##
# Make new dataframe (columns selected by name instead of magic indices,
# same set as the original c(2,3,4,5,6,7,15,16,18,20,29,30,32,33,34,35))
# and split 70/30 into train and test sets.
set.seed(9)
nb_cols <- c("Age", "Attrition", "BusinessTravel", "DailyRate", "Department",
             "DistanceFromHome", "JobInvolvement", "JobLevel", "JobSatisfaction",
             "MonthlyIncome", "StockOptionLevel", "TotalWorkingYears",
             "WorkLifeBalance", "YearsAtCompany", "YearsInCurrentRole",
             "YearsSinceLastPromotion")
new.df <- attrition.df[, nb_cols]
splitPerc <- .7
trainIndices <- sample(1:dim(new.df)[1], round(splitPerc * dim(new.df)[1]))
dfTrain1 <- new.df[trainIndices, ]
dfTest1 <- new.df[-trainIndices, ]
dfTrain1 <- na.omit(dfTrain1)
dfTest1 <- na.omit(dfTest1)
# Naive Bayes Model for the reduced predictor set
new.dfmodel <- naiveBayes(Attrition ~ ., data = dfTrain1)
confusionMatrix(table(predict(new.dfmodel, dfTest1), dfTest1$Attrition))
## Confusion Matrix and Statistics
##
##
## No Yes
## No 206 20
## Yes 17 18
##
## Accuracy : 0.8582
## 95% CI : (0.8099, 0.8982)
## No Information Rate : 0.8544
## P-Value [Acc > NIR] : 0.4733
##
## Kappa : 0.4109
##
## Mcnemar's Test P-Value : 0.7423
##
## Sensitivity : 0.9238
## Specificity : 0.4737
## Pos Pred Value : 0.9115
## Neg Pred Value : 0.5143
## Prevalence : 0.8544
## Detection Rate : 0.7893
## Detection Prevalence : 0.8659
## Balanced Accuracy : 0.6987
##
## 'Positive' Class : No
##
# NOTE: not sure whether linear regression is actually appropriate for a
# binary response like this -- logistic regression would be the usual choice.
# Recode Attrition to numeric (Yes = 1, No = 0) for use in linear regression.
# Direct logical comparison replaces the original pair of gsub() calls, which
# did string substitution and would silently corrupt any value containing
# "Yes"/"No" as a substring.
attrition.lm <- attrition.df
attrition.lm$Attrition <- as.numeric(attrition.lm$Attrition == "Yes")
# Candidate models: interaction, additive, and the two single-predictor fits
attritionfit <- lm(Attrition~Age*MonthlyIncome, data = attrition.lm)
attritionfit1 <- lm(Attrition~Age+MonthlyIncome, data = attrition.lm)
attritionfit2 <- lm(Attrition~MonthlyIncome, data = attrition.lm)
attritionfit3 <- lm(Attrition~Age, data = attrition.lm)
# information about the interaction model
summary(attritionfit)
##
## Call:
## lm(formula = Attrition ~ Age * MonthlyIncome, data = attrition.lm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.37829 -0.19156 -0.13330 -0.06808 1.02288
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.202e-01 8.883e-02 6.981 5.83e-12 ***
## Age -1.030e-02 2.353e-03 -4.379 1.34e-05 ***
## MonthlyIncome -5.761e-05 1.400e-05 -4.114 4.26e-05 ***
## Age:MonthlyIncome 1.130e-06 3.153e-07 3.583 0.000358 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3599 on 866 degrees of freedom
## Multiple R-squared: 0.04538, Adjusted R-squared: 0.04207
## F-statistic: 13.72 on 3 and 866 DF, p-value: 9.485e-09
# 95% confidence intervals for the coefficients
confint(attritionfit)
## 2.5 % 97.5 %
## (Intercept) 4.457982e-01 7.945094e-01
## Age -1.491937e-02 -5.684270e-03
## MonthlyIncome -8.508807e-05 -3.012455e-05
## Age:MonthlyIncome 5.109013e-07 1.748437e-06
attritionfit$coefficients
## (Intercept) Age MonthlyIncome Age:MonthlyIncome
## 6.201538e-01 -1.030182e-02 -5.760631e-05 1.129669e-06
hist(attritionfit$residuals, col = "blue", main = "Histogram of Residuals")
# In-sample root-mean-squared error of the residuals
sqrt(mean(attritionfit$residuals^2))
## [1] 0.3590231
# Residual sum of squares
sum((attritionfit$residuals)^2)
## [1] 112.1409
# 80/20 train/test split for evaluating the attrition linear model
set.seed(9)
splitPerc <- .8
trainIndices <- sample(1:dim(attrition.lm)[1],round(splitPerc * dim(attrition.lm)[1]))
dfTrain2 <- attrition.lm[trainIndices,]
dfTest2 <- attrition.lm[-trainIndices,]
dfTrain2 <- na.omit(dfTrain2)
dfTest2 <- na.omit(dfTest2)
# Best LM model for Attrition: Age, MonthlyIncome, and their interaction
LR.fit <- lm(Attrition~Age*MonthlyIncome, data = dfTrain2)
summary(LR.fit)
##
## Call:
## lm(formula = Attrition ~ Age * MonthlyIncome, data = dfTrain2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.38770 -0.19212 -0.13098 -0.06281 0.97511
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.504e-01 9.709e-02 6.698 4.37e-11 ***
## Age -1.093e-02 2.578e-03 -4.238 2.56e-05 ***
## MonthlyIncome -6.789e-05 1.542e-05 -4.403 1.24e-05 ***
## Age:MonthlyIncome 1.351e-06 3.464e-07 3.900 0.000106 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3576 on 692 degrees of freedom
## Multiple R-squared: 0.05128, Adjusted R-squared: 0.04716
## F-statistic: 12.47 on 3 and 692 DF, p-value: 6.013e-08
# Predicted attrition scores on the hold-out set. Since this is OLS rather
# than logistic regression the fitted values are not bounded to [0, 1] --
# the printed output below includes negative predictions.
LR.Preds <- predict(LR.fit, newdata = dfTest2)
as.data.frame(LR.Preds)
## LR.Preds
## 4 0.0436343376
## 7 0.1757354301
## 8 0.1261548668
## 11 -0.0096785475
## 20 0.1270006629
## 23 0.1247949152
## 25 0.1277521340
## 33 0.0506196216
## 40 0.2390046050
## 47 0.1345714009
## 55 0.1965221637
## 56 0.1737091788
## 58 0.1978393508
## 62 0.1203358821
## 66 0.2193424035
## 81 0.1604895283
## 83 0.1470060521
## 84 0.1390505143
## 88 0.1907521927
## 92 0.1228033569
## 93 0.2086153683
## 99 0.2421487801
## 100 0.2064850283
## 101 0.2893754885
## 104 0.2952330668
## 105 0.1326403819
## 125 0.2026924251
## 129 0.1404653443
## 132 0.1072503453
## 133 0.1409489462
## 152 0.1675968220
## 157 0.1345922483
## 160 0.0442400680
## 161 0.1822631461
## 169 0.0952463419
## 176 0.3390744220
## 178 0.2993354296
## 185 0.1670835191
## 186 0.2821553040
## 188 0.0664447806
## 198 0.0925916183
## 206 0.1479196979
## 216 0.1807685996
## 220 0.2720391955
## 235 0.0141801220
## 236 0.0560347834
## 238 0.1447577332
## 240 0.0993334890
## 251 0.2429442292
## 254 0.2347121354
## 255 0.0226159823
## 256 0.1565770557
## 257 0.2390566349
## 260 0.2414836826
## 264 0.2715256733
## 271 0.1220118722
## 275 0.1692231504
## 281 0.1035349089
## 283 0.1230065851
## 285 0.1880249177
## 286 0.1683642833
## 288 0.2203874771
## 298 0.1454284601
## 335 0.2281072295
## 338 0.1084870401
## 346 0.0212883864
## 347 0.1902545233
## 366 0.1408643103
## 370 0.1760073288
## 380 0.2641960260
## 392 0.2300298652
## 394 0.1297804955
## 403 0.1275064367
## 405 0.2151568837
## 420 -0.0092634384
## 425 0.2023571148
## 427 0.2876757891
## 446 0.1217159934
## 453 0.1480852077
## 456 -0.0615879524
## 461 0.1680604569
## 463 0.1007597374
## 466 0.1698090307
## 470 0.0876810399
## 480 -0.0001150151
## 482 0.2503211157
## 492 0.0980652835
## 493 0.1649303386
## 496 0.1093654833
## 497 0.1397739409
## 498 0.2043884617
## 499 0.1561042012
## 505 0.0611739359
## 508 0.1036572356
## 510 -0.0421241154
## 511 0.1466417344
## 515 0.3177324288
## 518 0.1605517957
## 526 0.0990615121
## 530 0.0769086639
## 531 0.0890132918
## 533 0.2559326596
## 540 0.1338977221
## 548 0.1426717240
## 580 0.1432781612
## 591 0.3025435343
## 592 0.1778235817
## 594 0.1094560472
## 605 0.2542096746
## 607 0.0702584011
## 608 0.1423030413
## 613 0.2266480798
## 615 0.1237552980
## 621 0.3122184932
## 623 0.2067925074
## 626 0.2762428213
## 627 0.2404771972
## 629 0.1196300431
## 630 0.0894892230
## 631 0.0664003641
## 632 0.0616623626
## 642 0.1516242819
## 644 0.2281623063
## 647 0.3707016773
## 649 0.1790239249
## 673 0.1971254325
## 685 0.2628221331
## 693 0.1157564397
## 694 0.2600090752
## 696 0.1222708191
## 702 0.1533350660
## 708 0.2567468125
## 710 0.0976986242
## 713 0.1180365042
## 721 0.0770797638
## 722 0.1427401023
## 724 0.0863588681
## 732 0.1445283155
## 743 0.1693015541
## 745 0.2097527218
## 746 0.3066642860
## 752 0.2517259239
## 754 0.1056173402
## 757 0.1851663217
## 763 0.1254043467
## 768 0.0617661000
## 772 0.0875552102
## 774 0.1917184296
## 775 0.0784585878
## 777 0.2660568930
## 779 0.2820336150
## 781 0.1630651052
## 782 0.1549465990
## 787 0.1061760211
## 792 0.2280744551
## 796 0.2117041144
## 802 0.1403818648
## 803 -0.0353962014
## 807 0.1645446326
## 813 0.2370011335
## 818 0.0171994289
## 820 0.0481126874
## 822 0.1344335052
## 826 0.0347670672
## 836 0.0878261367
## 837 0.1923883452
## 842 0.1040802539
## 846 0.2073482600
## 848 0.0779881835
## 851 0.1930101432
## 854 0.1741842048
## 861 0.0959029745
## 863 0.1623398117
## 869 0.1268218404
# Metrics for model: per-row prediction errors on the hold-out set.
# (Column typo "Resisdual" fixed to "Residual"; SquaredResidual, which the
# later RMSE code reads, keeps its original name.)
MSPE <- data.frame(Observed = dfTest2$Attrition, Predicted = LR.Preds)
MSPE$Residual <- MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual <- MSPE$Residual^2
MSPE
## Observed Predicted Resisdual SquaredResidual
## 4 0 0.0436343376 -0.0436343376 1.903955e-03
## 7 0 0.1757354301 -0.1757354301 3.088294e-02
## 8 0 0.1261548668 -0.1261548668 1.591505e-02
## 11 0 -0.0096785475 0.0096785475 9.367428e-05
## 20 0 0.1270006629 -0.1270006629 1.612917e-02
## 23 0 0.1247949152 -0.1247949152 1.557377e-02
## 25 0 0.1277521340 -0.1277521340 1.632061e-02
## 33 0 0.0506196216 -0.0506196216 2.562346e-03
## 40 0 0.2390046050 -0.2390046050 5.712320e-02
## 47 1 0.1345714009 0.8654285991 7.489667e-01
## 55 0 0.1965221637 -0.1965221637 3.862096e-02
## 56 0 0.1737091788 -0.1737091788 3.017488e-02
## 58 0 0.1978393508 -0.1978393508 3.914041e-02
## 62 0 0.1203358821 -0.1203358821 1.448072e-02
## 66 0 0.2193424035 -0.2193424035 4.811109e-02
## 81 1 0.1604895283 0.8395104717 7.047778e-01
## 83 0 0.1470060521 -0.1470060521 2.161078e-02
## 84 0 0.1390505143 -0.1390505143 1.933505e-02
## 88 1 0.1907521927 0.8092478073 6.548820e-01
## 92 1 0.1228033569 0.8771966431 7.694740e-01
## 93 1 0.2086153683 0.7913846317 6.262896e-01
## 99 0 0.2421487801 -0.2421487801 5.863603e-02
## 100 0 0.2064850283 -0.2064850283 4.263607e-02
## 101 0 0.2893754885 -0.2893754885 8.373817e-02
## 104 0 0.2952330668 -0.2952330668 8.716256e-02
## 105 0 0.1326403819 -0.1326403819 1.759347e-02
## 125 0 0.2026924251 -0.2026924251 4.108422e-02
## 129 0 0.1404653443 -0.1404653443 1.973051e-02
## 132 0 0.1072503453 -0.1072503453 1.150264e-02
## 133 0 0.1409489462 -0.1409489462 1.986661e-02
## 152 0 0.1675968220 -0.1675968220 2.808869e-02
## 157 0 0.1345922483 -0.1345922483 1.811507e-02
## 160 0 0.0442400680 -0.0442400680 1.957184e-03
## 161 0 0.1822631461 -0.1822631461 3.321985e-02
## 169 0 0.0952463419 -0.0952463419 9.071866e-03
## 176 0 0.3390744220 -0.3390744220 1.149715e-01
## 178 1 0.2993354296 0.7006645704 4.909308e-01
## 185 0 0.1670835191 -0.1670835191 2.791690e-02
## 186 0 0.2821553040 -0.2821553040 7.961162e-02
## 188 0 0.0664447806 -0.0664447806 4.414909e-03
## 198 0 0.0925916183 -0.0925916183 8.573208e-03
## 206 0 0.1479196979 -0.1479196979 2.188024e-02
## 216 0 0.1807685996 -0.1807685996 3.267729e-02
## 220 0 0.2720391955 -0.2720391955 7.400532e-02
## 235 0 0.0141801220 -0.0141801220 2.010759e-04
## 236 1 0.0560347834 0.9439652166 8.910703e-01
## 238 0 0.1447577332 -0.1447577332 2.095480e-02
## 240 0 0.0993334890 -0.0993334890 9.867142e-03
## 251 0 0.2429442292 -0.2429442292 5.902190e-02
## 254 1 0.2347121354 0.7652878646 5.856655e-01
## 255 0 0.0226159823 -0.0226159823 5.114827e-04
## 256 0 0.1565770557 -0.1565770557 2.451637e-02
## 257 0 0.2390566349 -0.2390566349 5.714807e-02
## 260 0 0.2414836826 -0.2414836826 5.831437e-02
## 264 0 0.2715256733 -0.2715256733 7.372619e-02
## 271 0 0.1220118722 -0.1220118722 1.488690e-02
## 275 0 0.1692231504 -0.1692231504 2.863647e-02
## 281 0 0.1035349089 -0.1035349089 1.071948e-02
## 283 0 0.1230065851 -0.1230065851 1.513062e-02
## 285 0 0.1880249177 -0.1880249177 3.535337e-02
## 286 0 0.1683642833 -0.1683642833 2.834653e-02
## 288 0 0.2203874771 -0.2203874771 4.857064e-02
## 298 1 0.1454284601 0.8545715399 7.302925e-01
## 335 0 0.2281072295 -0.2281072295 5.203291e-02
## 338 0 0.1084870401 -0.1084870401 1.176944e-02
## 346 0 0.0212883864 -0.0212883864 4.531954e-04
## 347 0 0.1902545233 -0.1902545233 3.619678e-02
## 366 0 0.1408643103 -0.1408643103 1.984275e-02
## 370 0 0.1760073288 -0.1760073288 3.097858e-02
## 380 1 0.2641960260 0.7358039740 5.414075e-01
## 392 0 0.2300298652 -0.2300298652 5.291374e-02
## 394 0 0.1297804955 -0.1297804955 1.684298e-02
## 403 0 0.1275064367 -0.1275064367 1.625789e-02
## 405 0 0.2151568837 -0.2151568837 4.629248e-02
## 420 0 -0.0092634384 0.0092634384 8.581129e-05
## 425 0 0.2023571148 -0.2023571148 4.094840e-02
## 427 0 0.2876757891 -0.2876757891 8.275736e-02
## 446 0 0.1217159934 -0.1217159934 1.481478e-02
## 453 1 0.1480852077 0.8519147923 7.257588e-01
## 456 0 -0.0615879524 0.0615879524 3.793076e-03
## 461 0 0.1680604569 -0.1680604569 2.824432e-02
## 463 0 0.1007597374 -0.1007597374 1.015252e-02
## 466 0 0.1698090307 -0.1698090307 2.883511e-02
## 470 0 0.0876810399 -0.0876810399 7.687965e-03
## 480 0 -0.0001150151 0.0001150151 1.322847e-08
## 482 0 0.2503211157 -0.2503211157 6.266066e-02
## 492 0 0.0980652835 -0.0980652835 9.616800e-03
## 493 0 0.1649303386 -0.1649303386 2.720202e-02
## 496 0 0.1093654833 -0.1093654833 1.196081e-02
## 497 0 0.1397739409 -0.1397739409 1.953675e-02
## 498 0 0.2043884617 -0.2043884617 4.177464e-02
## 499 0 0.1561042012 -0.1561042012 2.436852e-02
## 505 0 0.0611739359 -0.0611739359 3.742250e-03
## 508 0 0.1036572356 -0.1036572356 1.074482e-02
## 510 1 -0.0421241154 1.0421241154 1.086023e+00
## 511 0 0.1466417344 -0.1466417344 2.150380e-02
## 515 0 0.3177324288 -0.3177324288 1.009539e-01
## 518 0 0.1605517957 -0.1605517957 2.577688e-02
## 526 0 0.0990615121 -0.0990615121 9.813183e-03
## 530 1 0.0769086639 0.9230913361 8.520976e-01
## 531 0 0.0890132918 -0.0890132918 7.923366e-03
## 533 0 0.2559326596 -0.2559326596 6.550153e-02
## 540 0 0.1338977221 -0.1338977221 1.792860e-02
## 548 0 0.1426717240 -0.1426717240 2.035522e-02
## 580 0 0.1432781612 -0.1432781612 2.052863e-02
## 591 0 0.3025435343 -0.3025435343 9.153259e-02
## 592 1 0.1778235817 0.8221764183 6.759741e-01
## 594 0 0.1094560472 -0.1094560472 1.198063e-02
## 605 1 0.2542096746 0.7457903254 5.562032e-01
## 607 0 0.0702584011 -0.0702584011 4.936243e-03
## 608 0 0.1423030413 -0.1423030413 2.025016e-02
## 613 1 0.2266480798 0.7733519202 5.980732e-01
## 615 0 0.1237552980 -0.1237552980 1.531537e-02
## 621 1 0.3122184932 0.6877815068 4.730434e-01
## 623 0 0.2067925074 -0.2067925074 4.276314e-02
## 626 0 0.2762428213 -0.2762428213 7.631010e-02
## 627 0 0.2404771972 -0.2404771972 5.782928e-02
## 629 0 0.1196300431 -0.1196300431 1.431135e-02
## 630 0 0.0894892230 -0.0894892230 8.008321e-03
## 631 0 0.0664003641 -0.0664003641 4.409008e-03
## 632 0 0.0616623626 -0.0616623626 3.802247e-03
## 642 0 0.1516242819 -0.1516242819 2.298992e-02
## 644 1 0.2281623063 0.7718376937 5.957334e-01
## 647 1 0.3707016773 0.6292983227 3.960164e-01
## 649 0 0.1790239249 -0.1790239249 3.204957e-02
## 673 0 0.1971254325 -0.1971254325 3.885844e-02
## 685 0 0.2628221331 -0.2628221331 6.907547e-02
## 693 0 0.1157564397 -0.1157564397 1.339955e-02
## 694 1 0.2600090752 0.7399909248 5.475866e-01
## 696 0 0.1222708191 -0.1222708191 1.495015e-02
## 702 0 0.1533350660 -0.1533350660 2.351164e-02
## 708 0 0.2567468125 -0.2567468125 6.591893e-02
## 710 0 0.0976986242 -0.0976986242 9.545021e-03
## 713 0 0.1180365042 -0.1180365042 1.393262e-02
## 721 0 0.0770797638 -0.0770797638 5.941290e-03
## 722 0 0.1427401023 -0.1427401023 2.037474e-02
## 724 0 0.0863588681 -0.0863588681 7.457854e-03
## 732 1 0.1445283155 0.8554716845 7.318318e-01
## 743 0 0.1693015541 -0.1693015541 2.866302e-02
## 745 0 0.2097527218 -0.2097527218 4.399620e-02
## 746 0 0.3066642860 -0.3066642860 9.404298e-02
## 752 1 0.2517259239 0.7482740761 5.599141e-01
## 754 1 0.1056173402 0.8943826598 7.999203e-01
## 757 0 0.1851663217 -0.1851663217 3.428657e-02
## 763 0 0.1254043467 -0.1254043467 1.572625e-02
## 768 0 0.0617661000 -0.0617661000 3.815051e-03
## 772 0 0.0875552102 -0.0875552102 7.665915e-03
## 774 1 0.1917184296 0.8082815704 6.533191e-01
## 775 0 0.0784585878 -0.0784585878 6.155750e-03
## 777 0 0.2660568930 -0.2660568930 7.078627e-02
## 779 0 0.2820336150 -0.2820336150 7.954296e-02
## 781 1 0.1630651052 0.8369348948 7.004600e-01
## 782 0 0.1549465990 -0.1549465990 2.400845e-02
## 787 0 0.1061760211 -0.1061760211 1.127335e-02
## 792 0 0.2280744551 -0.2280744551 5.201796e-02
## 796 1 0.2117041144 0.7882958856 6.214104e-01
## 802 0 0.1403818648 -0.1403818648 1.970707e-02
## 803 0 -0.0353962014 0.0353962014 1.252891e-03
## 807 0 0.1645446326 -0.1645446326 2.707494e-02
## 813 0 0.2370011335 -0.2370011335 5.616954e-02
## 818 0 0.0171994289 -0.0171994289 2.958204e-04
## 820 1 0.0481126874 0.9518873126 9.060895e-01
## 822 0 0.1344335052 -0.1344335052 1.807237e-02
## 826 0 0.0347670672 -0.0347670672 1.208749e-03
## 836 0 0.0878261367 -0.0878261367 7.713430e-03
## 837 0 0.1923883452 -0.1923883452 3.701328e-02
## 842 0 0.1040802539 -0.1040802539 1.083270e-02
## 846 0 0.2073482600 -0.2073482600 4.299330e-02
## 848 0 0.0779881835 -0.0779881835 6.082157e-03
## 851 1 0.1930101432 0.8069898568 6.512326e-01
## 854 0 0.1741842048 -0.1741842048 3.034014e-02
## 861 0 0.0959029745 -0.0959029745 9.197381e-03
## 863 1 0.1623398117 0.8376601883 7.016746e-01
## 869 0 0.1268218404 -0.1268218404 1.608378e-02
# Mean squared prediction error on the test set
mean(MSPE$SquaredResidual)
## [1] 0.1362915
# Root-mean-squared prediction error (plain sqrt(), no pipe)
RMSE <- sqrt(mean((MSPE$Observed - MSPE$Predicted)^2))
RMSE
## [1] 0.3691768
# JobLevel is an integer code 1-5. Convert it to character so lm() treats it
# as categorical (one coefficient per level) rather than a single slope.
# A single as.character() replaces the original five gsub(k, "k", ...) calls,
# which were no-ops (substituting each digit for itself).
attrition.jl <- attrition.df
attrition.jl$JobLevel <- as.character(attrition.jl$JobLevel)
# Job level model: income explained by job level as a categorical predictor
JLModel_fit <- lm(MonthlyIncome ~ JobLevel, data = attrition.jl)
summary(JLModel_fit)
##
## Call:
## lm(formula = MonthlyIncome ~ JobLevel, data = attrition.jl)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4642.2 -668.0 -107.3 668.3 4412.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2743.82 69.69 39.37 <2e-16 ***
## JobLevel2 2800.46 99.89 28.04 <2e-16 ***
## JobLevel3 7108.38 130.24 54.58 <2e-16 ***
## JobLevel4 12509.83 177.45 70.50 <2e-16 ***
## JobLevel5 16480.15 219.18 75.19 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1264 on 865 degrees of freedom
## Multiple R-squared: 0.9248, Adjusted R-squared: 0.9244
## F-statistic: 2658 on 4 and 865 DF, p-value: < 2.2e-16
# Diagnostics and fitted-value overlay for the categorical JobLevel model.
preds <- predict(JLModel_fit)
hist(JLModel_fit$residuals, col = "blue", main = "Histogram of Residuals")
plot(JLModel_fit$fitted.values, JLModel_fit$residuals, main = "Plot of Residuals v. Fitted Values")
# FIX: `col = "red"` was inside aes(), which maps the literal string "red"
# through the colour scale (salmon line + spurious legend) rather than
# drawing a red line; moved outside aes(). Added group = 1 so geom_line
# connects the fitted values across the discrete JobLevel axis.
attrition.jl %>% ggplot(aes(x = JobLevel, y = MonthlyIncome)) +
  geom_point() +
  geom_line(aes(y = preds, group = 1), col = "red")
# Baseline model: JobLevel kept numeric, so a single slope per level step.
Model1_fit <- lm(MonthlyIncome ~ JobLevel, data = attrition.df)
summary(Model1_fit)
##
## Call:
## lm(formula = MonthlyIncome ~ JobLevel, data = attrition.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5037.1 -928.2 80.1 697.1 3723.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1793.93 101.68 -17.64 <2e-16 ***
## JobLevel 4013.67 43.98 91.26 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1413 on 868 degrees of freedom
## Multiple R-squared: 0.9056, Adjusted R-squared: 0.9055
## F-statistic: 8329 on 1 and 868 DF, p-value: < 2.2e-16
hist(Model1_fit$residuals, col = "blue", main = "Histogram of Residuals")
# Predict MonthlyIncome for the employees missing a salary
# (noSalary is constructed earlier in the file).
SalaryPredict <- predict(Model1_fit, noSalary, interval = "confidence")
SalaryPredict <- as.data.frame(SalaryPredict)
noSalary$MonthlyIncome <- SalaryPredict[, 1]  # column 1 = fitted value
# FIX: moved `col = "red"` outside aes(); inside aes() the constant is mapped
# through the colour scale, producing a salmon line plus a spurious legend.
noSalary %>% ggplot(aes(x = JobLevel, y = MonthlyIncome)) +
  geom_point() +
  geom_line(col = "red")
# Good Exponential Model: add a squared JobLevel term
attriton.df2 <- attrition.df %>% mutate(JobLevel2 = JobLevel^2)
fit <- lm(MonthlyIncome ~ JobLevel + JobLevel2, data = attriton.df2)
summary(fit)
##
## Call:
## lm(formula = MonthlyIncome ~ JobLevel + JobLevel2, data = attriton.df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4502.5 -744.2 -131.4 656.1 4177.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 299.75 190.65 1.572 0.116
## JobLevel 1944.17 169.12 11.496 <2e-16 ***
## JobLevel2 397.80 31.56 12.603 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1300 on 867 degrees of freedom
## Multiple R-squared: 0.9202, Adjusted R-squared: 0.92
## F-statistic: 5001 on 2 and 867 DF, p-value: < 2.2e-16
# Diagnostics, fitted-value overlay, and hold-out predictions for the
# quadratic model.
preds2 <- predict(fit)
hist(fit$residuals, col = "blue", main = "Histogram of Residuals")
plot(fit$fitted.values, fit$residuals, main = "Plot of Residuals v. Fitted Values")
# FIX: the original overlay plotted `preds` (predictions from the earlier
# JLModel_fit), not `preds2` from this quadratic fit. Also moved
# `col = "red"` outside aes() so the line is actually red with no legend.
attriton.df2 %>% ggplot(aes(x = JobLevel, y = MonthlyIncome)) +
  geom_point() +
  geom_line(aes(y = preds2), col = "red")
noSalary <- noSalary %>% mutate(JobLevel2 = JobLevel^2)
test2 <- predict(fit, noSalary, interval = "confidence")
test2 <- as.data.frame(test2)
noSalary$Salary <- test2[, 1]  # fitted value
noSalary %>% ggplot(aes(x = JobLevel, y = Salary)) +
  geom_point() +
  geom_line(col = "red")
# Adding extra exponential variable helped
# Cubic model: powers of JobLevel up to 3
attriton.df3 <- attrition.df %>%
  mutate(JobLevel2 = JobLevel^2,
         JobLevel3 = JobLevel^3)
fit2 <- lm(MonthlyIncome ~ JobLevel + JobLevel2 + JobLevel3, data = attriton.df3)
summary(fit2)
##
## Call:
## lm(formula = MonthlyIncome ~ JobLevel + JobLevel2 + JobLevel3,
## data = attriton.df3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4796.8 -669.9 -108.7 652.2 4456.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3012.60 435.69 6.915 9.11e-12 ***
## JobLevel -2176.07 620.83 -3.505 0.00048 ***
## JobLevel2 2125.20 252.82 8.406 < 2e-16 ***
## JobLevel3 -207.57 30.15 -6.884 1.12e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1267 on 866 degrees of freedom
## Multiple R-squared: 0.9244, Adjusted R-squared: 0.9241
## F-statistic: 3528 on 3 and 866 DF, p-value: < 2.2e-16
# Diagnostics, fitted-value overlay, and hold-out predictions for the
# cubic model (preds2 is intentionally reused/overwritten here).
preds2 <- predict(fit2)
hist(fit2$residuals, col = "blue", main = "Histogram of Residuals")
plot(fit2$fitted.values, fit2$residuals, main = "Plot of Residuals v. Fitted Values")
# FIX: moved `col = "red"` outside aes() in both plots; inside aes() the
# constant is mapped through the colour scale (salmon line + spurious legend).
attriton.df3 %>% ggplot(aes(x = JobLevel, y = MonthlyIncome)) +
  geom_point() +
  geom_line(aes(y = preds2), col = "red")
noSalary <- noSalary %>% mutate(JobLevel3 = JobLevel^3)
test3 <- predict(fit2, noSalary, interval = "confidence")
test3 <- as.data.frame(test3)
noSalary$Salary2 <- test3[, 1]  # fitted value
noSalary %>% ggplot(aes(x = JobLevel, y = Salary2)) +
  geom_point() +
  geom_line(col = "red")
# Adding extra exponential variable hurt
# Quartic model: powers of JobLevel up to 4
attriton.df4 <- attrition.df %>%
  mutate(JobLevel2 = JobLevel^2,
         JobLevel3 = JobLevel^3,
         JobLevel4 = JobLevel^4)
fit3 <- lm(MonthlyIncome ~ JobLevel + JobLevel2 + JobLevel3 + JobLevel4,
           data = attriton.df4)
summary(fit3)
##
## Call:
## lm(formula = MonthlyIncome ~ JobLevel + JobLevel2 + JobLevel3 +
## JobLevel4, data = attriton.df4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4642.2 -668.0 -107.3 668.3 4412.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -245.90 1597.80 -0.154 0.8777
## JobLevel 4177.61 3061.29 1.365 0.1727
## JobLevel2 -1910.41 1920.81 -0.995 0.3202
## JobLevel3 810.46 481.29 1.684 0.0926 .
## JobLevel4 -87.95 41.50 -2.119 0.0343 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1264 on 865 degrees of freedom
## Multiple R-squared: 0.9248, Adjusted R-squared: 0.9244
## F-statistic: 2658 on 4 and 865 DF, p-value: < 2.2e-16
# Diagnostics and fitted-value overlay for the quartic model.
preds3 <- predict(fit3)
hist(fit3$residuals, col = "blue", main = "Histogram of Residuals")
plot(fit3$fitted.values, fit3$residuals, main = "Plot of Residuals v. Fitted Values")
# FIX: moved `col = "red"` outside aes(); inside aes() it is mapped through
# the colour scale instead of colouring the line red.
attriton.df4 %>% ggplot(aes(x = JobLevel, y = MonthlyIncome)) +
  geom_point() +
  geom_line(aes(y = preds3), col = "red")
# best !!! — Age + JobLevel + JobRole gives the strongest fit so far
Model2_fit <- lm(MonthlyIncome ~ Age + JobLevel + JobRole, data = dfTrain)
summary(Model2_fit)
##
## Call:
## lm(formula = MonthlyIncome ~ Age + JobLevel + JobRole, data = dfTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3597.1 -707.3 -13.4 634.5 4018.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -179.200 277.614 -0.646 0.51881
## Age 10.725 5.265 2.037 0.04200 *
## JobLevel 2988.030 76.238 39.193 < 2e-16 ***
## JobRoleHuman Resources -502.938 285.061 -1.764 0.07810 .
## JobRoleLaboratory Technician -803.863 191.210 -4.204 2.95e-05 ***
## JobRoleManager 3823.428 254.520 15.022 < 2e-16 ***
## JobRoleManufacturing Director -22.335 184.840 -0.121 0.90386
## JobRoleResearch Director 3852.704 242.579 15.882 < 2e-16 ***
## JobRoleResearch Scientist -474.371 191.176 -2.481 0.01331 *
## JobRoleSales Executive -261.845 164.079 -1.596 0.11096
## JobRoleSales Representative -684.793 238.703 -2.869 0.00424 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1099 on 729 degrees of freedom
## Multiple R-squared: 0.9446, Adjusted R-squared: 0.9439
## F-statistic: 1244 on 10 and 729 DF, p-value: < 2.2e-16
preds <- predict(Model2_fit)
hist(Model2_fit$residuals, col = "blue", main = "Histogram of Residuals")
# Compare the square root of the residual sum of squares for the two models
sqrt(sum(Model1_fit$residuals^2))
## [1] 41638.29
sqrt(sum(Model2_fit$residuals^2))
## [1] 29679.15
# better
# Try adding JobSatisfaction on top of Age + JobLevel + JobRole
Model2_fit <- lm(MonthlyIncome ~ Age + JobLevel + JobRole + JobSatisfaction,
                 data = dfTrain)
summary(Model2_fit)
##
## Call:
## lm(formula = MonthlyIncome ~ Age + JobLevel + JobRole + JobSatisfaction,
## data = dfTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3575.5 -690.8 -26.4 647.2 4011.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -252.502 295.590 -0.854 0.39326
## Age 10.622 5.269 2.016 0.04416 *
## JobLevel 2988.926 76.273 39.187 < 2e-16 ***
## JobRoleHuman Resources -498.394 285.223 -1.747 0.08099 .
## JobRoleLaboratory Technician -800.257 191.337 -4.182 3.24e-05 ***
## JobRoleManager 3830.739 254.803 15.034 < 2e-16 ***
## JobRoleManufacturing Director -19.484 184.943 -0.105 0.91612
## JobRoleResearch Director 3859.614 242.846 15.893 < 2e-16 ***
## JobRoleResearch Scientist -473.397 191.243 -2.475 0.01354 *
## JobRoleSales Executive -259.069 164.177 -1.578 0.11500
## JobRoleSales Representative -678.904 238.919 -2.842 0.00461 **
## JobSatisfaction 26.447 36.535 0.724 0.46937
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1100 on 728 degrees of freedom
## Multiple R-squared: 0.9447, Adjusted R-squared: 0.9438
## F-statistic: 1130 on 11 and 728 DF, p-value: < 2.2e-16
# Predict on the hold-out set and inspect the fitted values
Model2_Preds <- predict(Model2_fit, newdata = dfTest)
as.data.frame(Model2_Preds)
## Model2_Preds
## 4 8900.911
## 7 2777.885
## 8 5965.097
## 11 19058.970
## 20 6219.181
## 23 5461.196
## 33 16075.464
## 40 2492.601
## 55 6140.947
## 56 2398.131
## 58 2270.663
## 62 9176.022
## 66 2655.836
## 81 6171.272
## 83 6188.857
## 88 2360.844
## 93 2408.535
## 99 2297.110
## 100 2350.221
## 101 2249.418
## 104 2533.788
## 132 6092.565
## 133 6277.278
## 152 5911.985
## 157 5944.070
## 160 13046.477
## 161 5349.552
## 169 5830.763
## 176 2538.991
## 178 2565.655
## 185 2429.780
## 206 2756.858
## 216 2767.262
## 220 2644.996
## 235 16096.491
## 236 8927.576
## 240 13131.455
## 254 2593.770
## 255 13051.679
## 264 2471.357
## 271 6023.629
## 275 19188.866
## 283 6256.033
## 286 5354.972
## 288 2403.115
## 298 5365.594
## 346 11884.853
## 347 6146.367
## 380 2661.039
## 392 5848.251
## 394 9118.996
## 403 2514.759
## 405 2661.256
## 420 12961.280
## 425 5790.155
## 427 2661.039
## 446 2868.066
## 453 5943.852
## 461 2461.647
## 463 9092.114
## 470 6295.298
## 482 2698.108
## 492 16120.382
## 493 5853.671
## 498 2761.842
## 505 16141.408
## 508 9049.842
## 511 5975.719
## 515 2386.378
## 518 2435.200
## 526 9071.087
## 530 2910.773
## 531 9197.267
## 533 2650.416
## 540 5938.868
## 580 6139.187
## 592 5880.336
## 594 2483.110
## 605 2418.462
## 607 6172.342
## 608 5896.378
## 613 2313.152
## 615 8853.220
## 621 2254.402
## 623 2729.975
## 626 2629.172
## 627 5800.559
## 629 9331.916
## 630 9128.331
## 631 2953.045
## 632 8922.156
## 642 6176.692
## 644 2650.634
## 647 2206.711
## 649 5864.511
## 693 6277.278
## 694 2687.486
## 696 2662.559
## 702 5906.783
## 708 2620.217
## 710 16091.506
## 721 9167.161
## 732 5959.895
## 743 5750.768
## 745 5811.182
## 752 2439.707
## 757 6202.921
## 763 5954.692
## 768 8906.549
## 772 8970.283
## 774 2724.773
## 777 2636.042
## 779 2318.137
## 781 2545.931
## 782 6218.963
## 787 13166.097
## 796 2640.012
## 802 5901.363
## 803 15974.660
## 807 5811.182
## 813 5800.559
## 818 13003.769
## 826 19146.159
## 836 8901.129
## 842 8550.378
## 846 5248.966
## 851 5874.916
## 854 6049.223
## 863 5933.230
## 869 2820.374
# Build the prediction-error table for the hold-out set.
# FIX: corrected the misspelled column name "Resisdual" -> "Residual"
# (the name is only referenced inside this chunk, so the rename is
# self-contained; SquaredResidual, used later, is unchanged).
MSPE <- data.frame(Observed = dfTest$MonthlyIncome, Predicted = Model2_Preds)
MSPE$Residual <- MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual <- MSPE$Residual^2
MSPE
## Observed Predicted Residual SquaredResidual
## 4 10422 8900.911 1521.08860 2313710.5166
## 7 2127 2777.885 -650.88466 423650.8448
## 8 6694 5965.097 728.90293 531299.4768
## 11 19392 19058.970 333.03011 110909.0536
## 20 5033 6219.181 -1186.18116 1407025.7379
## 23 5679 5461.196 217.80438 47438.7490
## 33 16872 16075.464 796.53617 634469.8712
## 40 2791 2492.601 298.39878 89041.8349
## 55 4424 6140.947 -1716.94731 2947908.0496
## 56 2289 2398.131 -109.13076 11909.5233
## 58 4723 2270.663 2452.33749 6013959.1618
## 62 7094 9176.022 -2082.02247 4334817.5616
## 66 3298 2655.836 642.16355 412374.0219
## 81 4777 6171.272 -1394.27170 1943993.5702
## 83 5321 6188.857 -867.85676 753175.3626
## 88 3743 2360.844 1382.15634 1910356.1523
## 93 2090 2408.535 -318.53539 101464.7941
## 99 3180 2297.110 882.89047 779495.5776
## 100 3294 2350.221 943.77870 890718.2267
## 101 2099 2249.418 -150.41780 22625.5151
## 104 2819 2533.788 285.21176 81345.7472
## 132 4507 6092.565 -1585.56533 2514017.4002
## 133 4523 6277.278 -1754.27751 3077489.5972
## 152 5396 5911.985 -515.98530 266240.8313
## 157 5677 5944.070 -267.07009 71326.4342
## 160 13191 13046.477 144.52343 20887.0210
## 161 5126 5349.552 -223.55203 49975.5116
## 169 5577 5830.763 -253.76262 64395.4672
## 176 2070 2538.991 -468.99055 219952.1404
## 178 2042 2565.655 -523.65530 274214.8779
## 185 3755 2429.780 1325.21990 1756207.7894
## 206 2559 2756.858 -197.85768 39147.6625
## 216 2345 2767.262 -422.26231 178305.4575
## 220 2875 2644.996 230.00363 52901.6697
## 235 16959 16096.491 862.50919 743922.1026
## 236 10609 8927.576 1681.42385 2827186.1504
## 240 13269 13131.455 137.54459 18918.5149
## 254 2956 2593.770 362.22970 131210.3553
## 255 13757 13051.679 705.32111 497477.8737
## 264 2157 2471.357 -314.35651 98820.0131
## 271 5154 6023.629 -869.62889 756254.3992
## 275 18722 19188.866 -466.86641 217964.2414
## 283 6513 6256.033 256.96719 66032.1390
## 286 5207 5354.972 -147.97207 21895.7348
## 288 2661 2403.115 257.88465 66504.4938
## 298 6074 5365.594 708.40557 501838.4534
## 346 13341 11884.853 1456.14708 2120364.3146
## 347 4033 6146.367 -2113.36735 4466321.5414
## 380 2132 2661.039 -529.03877 279882.0155
## 392 4157 5848.251 -1691.25118 2860330.5385
## 394 10266 9118.996 1147.00374 1315617.5802
## 403 2105 2514.759 -409.75893 167902.3827
## 405 2559 2661.256 -102.25649 10456.3905
## 420 11935 12961.280 -1026.28001 1053250.6597
## 425 4724 5790.155 -1066.15482 1136686.0969
## 427 1274 2661.039 -1387.03877 1923876.5372
## 446 3420 2868.066 551.93419 304631.3488
## 453 5813 5943.852 -130.85236 17122.3413
## 461 2093 2461.647 -368.64716 135900.7291
## 463 9208 9092.114 115.88622 13429.6155
## 470 4450 6295.298 -1845.29804 3405124.8382
## 482 2356 2698.108 -342.10814 117037.9807
## 492 16880 16120.382 759.61849 577020.2501
## 493 5869 5853.671 15.32878 234.9716
## 498 2326 2761.842 -435.84227 189958.4824
## 505 17169 16141.408 1027.59151 1055944.3094
## 508 10596 9049.842 1546.15791 2390604.2746
## 511 5343 5975.719 -632.71943 400333.8739
## 515 2610 2386.378 223.62233 50006.9457
## 518 2766 2435.200 330.79986 109428.5481
## 526 7525 9071.087 -1546.08680 2390384.3971
## 530 4963 2910.773 2052.22704 4211635.8388
## 531 8823 9197.267 -374.26718 140075.9203
## 533 2700 2650.416 49.58359 2458.5323
## 540 5155 5938.868 -783.86778 614448.6951
## 580 6623 6139.187 483.81309 234075.1080
## 592 4599 5880.336 -1281.33597 1641821.8586
## 594 3211 2483.110 727.89040 529824.4384
## 605 2760 2418.462 341.53754 116647.8895
## 607 5473 6172.342 -699.34185 489079.0183
## 608 5605 5896.378 -291.37836 84901.3497
## 613 2707 2313.152 393.84807 155116.3039
## 615 7264 8853.220 -1589.21967 2525619.1708
## 621 2926 2254.402 671.59761 451043.3534
## 623 3280 2729.975 550.02480 302527.2753
## 626 2517 2629.172 -112.17170 12582.4909
## 627 4162 5800.559 -1638.55944 2684877.0545
## 629 10976 9331.916 1644.08413 2703012.6323
## 630 8621 9128.331 -507.33074 257384.4782
## 631 2662 2953.045 -291.04465 84706.9858
## 632 9888 8922.156 965.84389 932854.4145
## 642 4448 6176.692 -1728.69174 2988375.1318
## 644 2307 2650.634 -343.63414 118084.4216
## 647 1904 2206.711 -302.71066 91633.7417
## 649 4312 5864.511 -1552.51130 2410291.3328
## 693 6781 6277.278 503.72249 253736.3426
## 694 2285 2687.486 -402.48579 161994.8091
## 696 3294 2662.559 631.44112 398717.8826
## 702 5714 5906.783 -192.78299 37165.2806
## 708 2109 2620.217 -511.21732 261343.1506
## 710 17924 16091.506 1832.49378 3358033.4363
## 721 9434 9167.161 266.83949 71203.3123
## 732 5238 5959.895 -721.89476 521132.0444
## 743 4285 5750.768 -1465.76837 2148476.9158
## 745 4907 5811.182 -904.18180 817544.7261
## 752 2302 2439.707 -137.70717 18963.2650
## 757 4014 6202.921 -2188.92103 4791375.2930
## 763 6151 5954.692 196.30755 38536.6555
## 768 10932 8906.549 2025.45083 4102451.0523
## 772 10453 8970.283 1482.71670 2198448.8153
## 774 3388 2724.773 663.22711 439870.1974
## 777 2064 2636.042 -572.04199 327232.0381
## 779 2570 2318.137 251.86349 63435.2159
## 781 3140 2545.931 594.06929 352918.3159
## 782 4553 6218.963 -1665.96343 2775434.1481
## 787 13116 13166.097 -50.09652 2509.6613
## 796 3348 2640.012 707.98822 501247.3129
## 802 6582 5901.363 680.63705 463266.7977
## 803 17068 15974.660 1093.33967 1195391.6403
## 807 6232 5811.182 420.81820 177087.9581
## 813 4260 5800.559 -1540.55944 2373323.4033
## 818 11691 13003.769 -1312.76943 1723363.5703
## 826 19627 19146.159 480.84074 231207.8161
## 836 8834 8901.129 -67.12913 4506.3204
## 842 7988 8550.378 -562.37780 316268.7865
## 846 4558 5248.966 -690.96626 477434.3710
## 851 4559 5874.916 -1315.91593 1731634.7223
## 854 5661 6049.223 -388.22349 150717.4766
## 863 5304 5933.230 -629.23001 395930.4057
## 869 4477 2820.374 1656.62592 2744409.4370
# MSPE and RMSE for Model2 on the hold-out set
mean(MSPE$SquaredResidual)
## [1] 970738.6
RMSE <- sqrt(mean((MSPE$Observed - MSPE$Predicted)^2))
RMSE
## [1] 985.2607